From 3788e861e545d43d5911677250c0f55d5f7ccedf Mon Sep 17 00:00:00 2001
From: Debarshi Ray <debarshir@gnome.org>
Date: Mon, 14 May 2018 11:24:27 +0200
Subject: [PATCH] CIE: Add an SSE2 version of "RGBA float" to "CIE L float"

On an Intel i7 Haswell, it now takes 0.056s to convert a 15 megapixel
buffer from "RGBA float" to "CIE L float" instead of the earlier
0.107s.

https://bugzilla.gnome.org/show_bug.cgi?id=795686
---
 extensions/CIE.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/extensions/CIE.c b/extensions/CIE.c
index c8a79b9..b67d206 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -1123,6 +1123,69 @@ lab_r_to_f_sse2 (__m128 r)
   return f;
 }
 
+static void
+rgbaf_to_Lf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
+{ /* Convert `samples` RGBA float pixels at src into one "CIE L" float each at dst; alpha does not affect the result. */
+  const Babl *space = babl_conversion_get_source_space (conversion);
+  const float m_1_0 = space->space.RGBtoXYZf[3] / D50_WHITE_REF_Y; /* entries 3..5 of RGBtoXYZf, */
+  const float m_1_1 = space->space.RGBtoXYZf[4] / D50_WHITE_REF_Y; /* presumably the Y row of a  */
+  const float m_1_2 = space->space.RGBtoXYZf[5] / D50_WHITE_REF_Y; /* 3x3 matrix - TODO confirm  */
+  long i = 0; /* number of samples already handled by the vector loop */
+  long remainder;
+  /* _mm_load_ps / _mm_store_ps require 16-byte alignment, so vectorize only when both pointers have it. */
+  if (((uintptr_t) src % 16) + ((uintptr_t) dst % 16) == 0)
+    {
+      const long    n = (samples / 4) * 4; /* samples rounded down to a multiple of 4 */
+      const __m128 m_1_0_v = _mm_set1_ps (m_1_0);
+      const __m128 m_1_1_v = _mm_set1_ps (m_1_1);
+      const __m128 m_1_2_v = _mm_set1_ps (m_1_2);
+
+      for ( ; i < n; i += 4) /* 4 pixels per iteration: 16 floats in, 4 floats out */
+        {
+          __m128 rgba0 = _mm_load_ps (src);
+          __m128 rgba1 = _mm_load_ps (src + 4);
+          __m128 rgba2 = _mm_load_ps (src + 8);
+          __m128 rgba3 = _mm_load_ps (src + 12);
+          /* Transpose the 4 interleaved pixels so that r, g, b and a each hold one channel of all four. */
+          __m128 r = rgba0;
+          __m128 g = rgba1;
+          __m128 b = rgba2;
+          __m128 a = rgba3;
+          _MM_TRANSPOSE4_PS (r, g, b, a);
+          /* Same weighted sum as the scalar tail below, computed for four pixels at once. */
+          {
+            __m128 yr = _mm_add_ps (_mm_add_ps (_mm_mul_ps (m_1_0_v, r), _mm_mul_ps (m_1_1_v, g)),
+                                    _mm_mul_ps (m_1_2_v, b));
+            /* NOTE(review): lab_r_to_f_sse2's body is not shown here; presumably the vector form of the cbrt/kappa branch - confirm. */
+            __m128 fy = lab_r_to_f_sse2 (yr);
+
+            __m128 L = _mm_sub_ps (_mm_mul_ps (_mm_set1_ps (116.0f), fy), _mm_set1_ps (16.0f));
+
+            _mm_store_ps (dst, L);
+          }
+
+          src += 16;
+          dst += 4;
+        }
+    }
+  /* Scalar path: the samples the vector loop did not consume (all of them when it was skipped). */
+  remainder = samples - i;
+  while (remainder--)
+    {
+      float r = src[0];
+      float g = src[1];
+      float b = src[2];
+      /* Cube-root branch above LAB_EPSILON, linear LAB_KAPPA * yr segment at or below it. */
+      float yr = m_1_0 * r + m_1_1 * g + m_1_2 * b;
+      float L = yr > LAB_EPSILON ? 116.0f * _cbrtf (yr) - 16 : LAB_KAPPA * yr;
+
+      dst[0] = L;
+
+      src += 4; /* step over the unused alpha float as well */
+      dst += 1;
+    }
+}
+
 static void
 rgbaf_to_Labaf_sse2 (const Babl *conversion, const float *src, float *dst, long samples)
 {
@@ -1411,6 +1474,13 @@ conversions (void)
         "linear", rgbaf_to_Labaf_sse2,
         NULL
       );
+
+      babl_conversion_new ( /* register rgbaf_to_Lf_sse2 for "RGBA float" -> "CIE L float" */
+        babl_format ("RGBA float"),
+        babl_format ("CIE L float"),
+        "linear", rgbaf_to_Lf_sse2,
+        NULL
+      );
     }
 
 #endif /* defined(USE_SSE2) */
-- 
2.30.2